1 Codigo Reproducible: Preparación y Limpieza de Datos - Recalibración de modelo KFRE para predecir falla renal en asegurados de EsSalud

Author

Percy Soto Becerra

0.1 Cargar paquetes

if (!require("pacman")) {
  install.packages("pacman")
}
Loading required package: pacman
pacman::p_unload(all)
The following packages have been unloaded:
pacman
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

if (!require("survcomp")) 
  BiocManager::install("survcomp")
Loading required package: survcomp
Loading required package: survival
Loading required package: prodlim
if (!require("pacman")) {
  install.packages("pacman")
}
Loading required package: pacman
pacman::p_load(
  here,
  tidyverse,
  rio,
  lubridate, 
  janitor, 
  naniar, 
  summarytools, 
  labelled, 
  survcomp
)

0.2 Importar datos

0.2.1 Datos de hemodiálisis

0.2.1.1 Datos de CNSR:

Vamos a transformar el nombre de las variables:

data_dial_sig <- import(here("Data", "Raw", "25082023_bd_dial_sig.csv"))
head(data_dial_sig, 10)
data_dial_sig2 <- data_dial_sig |> 
  rename(auto_cor = AUTOGENERADO, 
         dial_date = `FECHA DE 1ERA HEMODIALISIS`) |> 
  mutate(auto_cor = str_trim(auto_cor), 
         dial_date = dmy(dial_date)) |> 
  select(auto_cor, dial_date)

head(data_dial_sig2)
data_dial_sscc <- import(here("Data", "Raw", "25082023_bd_dial_sscc.csv"))
head(data_dial_sscc, 10)
data_dial_sscc 
data_dial_sscc2 <- data_dial_sscc |> 
  rename(auto_cor = AUTOGENERADO, 
         dial_date = `FECHA 1ERA HEMODIALISIS`) |> 
  mutate(auto_cor = str_trim(auto_cor), 
         dial_date = dmy(dial_date)) |> 
  select(auto_cor, dial_date)

head(data_dial_sscc2)

Por ultimo, fusionamos ambos datos:

data_dial_cnsr <- data_dial_sscc2 |> 
  bind_rows(data_dial_sig2)

head(data_dial_cnsr)

Asimismo, vamos a agregarle una columna donde figure la variable indicadora de dialisis:

data_dial_cnsr <- data_dial_cnsr |> 
  mutate(dial = case_when(is.na(dial_date) ~ 0, 
                          !is.na(dial_date) ~ 1, 
                          TRUE ~ 0), 
         dial_date = case_when(!is.na(dial_date) ~ dial_date, 
                               is.na(dial_date) ~ dmy("25-08-2023")))
data_dial_cnsr |> 
  head(10)

Vamos a ordernar los datos por dialisis (descendente) y eliminar duplicados:

data_dial_cnsr |> 
  arrange(auto_cor, desc(dial)) |> 
  get_dupes(auto_cor)
data_dial_cnsr <- data_dial_cnsr |> 
  arrange(auto_cor, desc(dial)) |> 
  distinct(auto_cor, .keep_all = TRUE)
data_dial_cnsr |> 
  arrange(auto_cor, desc(dial)) |> 
  get_dupes(auto_cor)
No duplicate combinations found of: auto_cor

0.2.1.2 Datos basados en SGH:

Vamos a usar como proxy los CIE-10 para detectar el acceso a hemodialisis: - https://doi.org/10.1093/ckj/sfx085 - https://icd.who.int/browse10/2019/en#/Z80-Z99

En base a la tabla de líneas arriba, vamos a detectar los siguientes CIE-10 como si fueran evidencia de dialisis: - Z99.2 (Dependence on renal dialysis) - Z49.1 (Extracorporeal dialysis) - Z49.2 (Other dialysis - peritoneal dyalisis) - N18.5 (CKD Stage 5) - N18.6 (CKD con dialisis) - Z94.0 (Kidney transplant status)

data_dial_sgh_emg <- import(here("Data", "Raw", "07092023_bd_dial_sgh_emerg.csv")) |> 
  mutate(ESCDIAG = str_trim(ESCDIAG), 
         ESDIAG2 = str_trim(ESDIAG2)) |> 
  select(ESAUTASE, ESFECHA, ESCDIAG, ESDIAG2) |> 
  rename(ESDIAG1 = ESCDIAG)

Transformaremos los datos a formato largo:

data_dial_sgh_emg <- data_dial_sgh_emg |> 
  pivot_longer(cols = c(ESDIAG1, ESDIAG2), 
               names_to = "diag_num", 
               values_to = "diag_cie")
head(data_dial_sgh_emg, 10)

Vamos a filtrar a los pacientes con los diagnosticos CIE-10 de interés:

data_dial_sgh_emg2 <- data_dial_sgh_emg |> 
  filter(diag_cie %in% c("Z99.2", "Z49.1", "Z49.2", "N18.5", "N18.6", "Z94.0"))

head(data_dial_sgh_emg2, 10)

Por último armonizaremos los nombres de las variables para permitir la fusión de los datos:

data_dial_sgh_emg2 <- data_dial_sgh_emg2 |> 
  rename(auto_cor = ESAUTASE, 
         dial_date = ESFECHA) |> 
  mutate(auto_cor = str_trim(auto_cor), 
         dial_date = dmy(dial_date)) |> 
  select(auto_cor, dial_date, diag_cie)

head(data_dial_sgh_emg2, 10)

Vamos a ordenar los datos de menor a mayor fecha (más antiguo a más reciente) por cada individuo (auto_cor):

data_dial_sgh_emg2 <- data_dial_sgh_emg2 |> 
  arrange(auto_cor, dial_date) 

head(data_dial_sgh_emg2, 10)

Ahora, vamos a eliminar los duplicados de auto_cor para así quedarns con la fecha más temprana de inicio probable de diálisis:

data_dial_sgh_emg2 <- data_dial_sgh_emg2 |> 
  distinct(auto_cor, .keep_all = TRUE)

head(data_dial_sgh_emg2, 10)

Ahora importaremos los datos de hospitalizados y haremos una armonización de los nombres de variable para permitir la fusión de los datos:

data_dial_sgh_hosp <- import(here("Data", "Raw", "07092023_bd_dial_sgh_hosp.csv")) |> 
  mutate(HOCDIAGI = str_trim(HOCDIAGI)) |> 
  rename(diag_cie = HOCDIAGI, 
         dial_date = HOFHOSP, 
         auto_cor = HOCAUTO) |> 
  mutate(auto_cor = str_trim(auto_cor), 
         dial_date = dmy(dial_date)) |> 
  select(auto_cor, dial_date, diag_cie)

head(data_dial_sgh_hosp, 10)

Luego vamos a filtrar

data_dial_sgh_hosp2 <- data_dial_sgh_hosp |> 
  filter(diag_cie %in% c("Z99.2", "Z49.1", "Z49.2", "N18.5", "N18.6", "Z94.0"))

Luego vamos a proceder a hacer una fusión por fila:

data_dial_sgh <- data_dial_sgh_hosp2 |> 
  bind_rows(data_dial_sgh_emg2)
data_dial_sgh |> 
  head(10)

Asimismo, vamos a agregarle una columna donde figure la variable indicadora de dialisis:

data_dial_sgh <- data_dial_sgh |> 
  mutate(dial = case_when(is.na(dial_date) ~ 0, 
                          !is.na(dial_date) ~ 1, 
                          TRUE ~ 0), 
         dial_date = case_when(!is.na(dial_date) ~ dial_date, 
                               is.na(dial_date) ~ dmy("07-09-2023")))
data_dial_sgh |> 
  head(10)

0.2.1.3 Fusion de datos CNSR + SGH

data_dial <- data_dial_sgh |> 
  bind_rows(data_dial_cnsr)

head(data_dial, 10)
dim(data_dial)
[1] 47126     4

Por cada individuo (autogenerado), vamos ahora a ordenar de menor fecha a mayor fecha y quedarnos solo con el de menor fecha (primera fila) eliminado los demás duplicados de auto_cor:

data_dial2 <- data_dial |> 
  arrange(auto_cor, desc(dial), dial_date) |> 
  distinct(auto_cor, .keep_all = TRUE)

head(data_dial2)

0.2.2 Datos de fallecimiento

data_fallec <- import(here("Data", "Raw", "02082023_bd_fallec.csv")) |> 
  clean_names()

data_fallec |> 
  head(10)
data_fallec <- data_fallec |> 
  rename(auto_cor = autogenerado, 
         death_date = dgaffal, 
         dni = dgandid) |> 
  mutate(auto_cor = str_trim(auto_cor), 
         death_date = dmy(death_date), 
         fuente = "data_fallec") |> 
  select(auto_cor, death_date, dni, fuente)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `death_date = dmy(death_date)`.
Caused by warning:
!  133735 failed to parse.
data_fallec |> 
  head(10)

0.2.3 Datos del hospital rebagliati

0.2.3.1 Importar y procesar datos

data_pred_reba <- import(here("Data", "Raw", "bd_predictores_reba_lima.csv")) |> 
  clean_names() 
data_pred_reba |> 
  head(10)
data_pred_reba2 <- data_pred_reba |> 
  rename(sex = sexo, 
         age = edad, 
         auto_cor = autogenerado, 
         assess_date = fecha_erc, 
         crea = creatinina, 
         urine_album = albuminuria, 
         urine_crea = creatinuria, 
         acr = rac, 
         cas_hosp = cas) |> 
  mutate(sex = factor(sex, 
                      levels = c("0", "1"), 
                      labels = c("Femenino", "Masculino")), 
         across(c(age, crea, urine_album, urine_crea, acr), ~ as.numeric(str_replace(., ",", "."))), 
         hta = as.numeric(hta), 
         dm = as.numeric(dm), 
         assess_date = dmy(assess_date),
         auto_cor = str_trim(auto_cor), 
         cas = "REBAGLIATI", 
         eGFR_ckdepi = case_when(
           crea <= 0.7 & sex == "Femenino" ~ 144 * ((crea / 0.7) ^ (-0.329)) * (0.993 ^ (age)) * 1, 
           crea > 0.7 & sex == "Femenino" ~ 144 * ((crea / 0.7) ^ (-1.209)) * (0.993 ^ (age)) * 1, 
           crea <= 0.9 & sex == "Masculino" ~ 141 * ((crea / 0.9) ^ (-0.411)) * (0.993 ^ (age)) * 1, 
           crea > 0.9 & sex == "Masculino" ~ 141 * ((crea / 0.9) ^ (-1.209)) * (0.993 ^ (age)) * 1, 
           TRUE ~ as.numeric(NA)
         )) |> 
  select(auto_cor, sex, age, hta, dm, cas, cas_hosp, assess_date, crea, eGFR_ckdepi, urine_album, urine_crea, acr)
Warning: There were 3 warnings in `mutate()`.
The first warning was:
ℹ In argument: `across(...)`.
Caused by warning:
! NAs introducidos por coerción
ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.

0.2.3.2 Inspeccionar datos perdidos

skimr::skim(data_pred_reba2)
Data summary
Name data_pred_reba2
Number of rows 13932
Number of columns 13
_______________________
Column type frequency:
character 3
Date 1
factor 1
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
auto_cor 0 1 13 15 0 13860 0
cas 0 1 10 10 0 1 0
cas_hosp 0 1 10 36 0 17 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
assess_date 0 1 2013-01-02 2022-12-29 2016-06-07 2510

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 3600 0.74 FALSE 2 Mas: 8248, Fem: 2084

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 1 1.00 78.33 26.11 12.0 71.00 79.00 86.10 2023.00 ▇▁▁▁▁
hta 4767 0.66 0.84 0.37 0.0 1.00 1.00 1.00 1.00 ▂▁▁▁▇
dm 7830 0.44 0.55 0.50 0.0 0.00 1.00 1.00 1.00 ▆▁▁▁▇
crea 0 1.00 1.56 0.55 0.9 1.23 1.40 1.71 4.95 ▇▂▁▁▁
eGFR_ckdepi 3600 0.74 39.55 11.95 0.0 30.26 41.95 49.53 62.50 ▁▃▅▇▅
urine_album 2656 0.81 16.28 149.08 0.0 0.30 1.00 4.10 7445.00 ▇▁▁▁▁
urine_crea 1918 0.86 73.03 53.44 0.1 42.90 64.70 86.50 3235.00 ▇▁▁▁▁
acr 2796 0.80 368.94 4486.38 0.0 4.63 14.23 65.09 195178.57 ▇▁▁▁▁
gg_miss_upset(data_pred_reba2)

gg_miss_upset(data_pred_reba2 |> select(eGFR_ckdepi, acr))
`geom_line()`: Each group consists of only one observation.
ℹ Do you need to adjust the group aesthetic?

data_pred_reba3 <- data_pred_reba2 |> 
  drop_na(eGFR_ckdepi, acr)

0.2.3.3 Fusionar datos de dialisis y mortalidad con datos de Rebagliati

data_pred_reba3 <- data_pred_reba2 |> 
  left_join(data_fallec, by = "auto_cor") |> 
  left_join(data_dial2, by = "auto_cor")

head(data_pred_reba3, 10)
data_pred_reba3 <- data_pred_reba3  |> 
  mutate(death = case_when(!is.na(death_date) ~ 1, 
                           is.na(death_date) ~ 0, 
                           TRUE ~ 0), 
         death_date = case_when(is.na(death_date ) ~ dmy("02-08-2023"), 
                                TRUE ~ death_date), 
         deathc = case_when(
           death == 0 | is.na(death) ~ 0, 
           death == 1 & death_date <= as.Date("2022-12-31") ~ 1, 
           death == 1 & death_date > as.Date("2022-12-31") ~ 0, 
           TRUE ~ as.numeric(NA)
         ),
         ddeathc = case_when(
           death_date <= as.Date("2022-12-31") ~ death_date, 
           death_date > as.Date("2022-12-31") | is.na(death_date) ~ as.Date("2022-12-31"),
           TRUE ~ as.Date(NA)
         ), 
         dial = case_when(!is.na(dial) ~ dial, 
                          is.na(dial) ~ 0, 
                          TRUE ~ as.numeric(NA)), 
         dial_date = case_when(!is.na(dial_date) ~ dial_date,
                               is.na(dial_date) ~ dmy("07-09-2023")))
data_pred_reba3  |> 
  head(10)
data_pred_reba4 <- data_pred_reba3 |> 
  mutate(
    male = if_else(sex == "Masculino", 1, 0), 
    risk2y = 1 - 0.9832 ^ exp(-0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137)), 
    risk5y = 1 - 0.9365 ^ exp(-0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137)), 
    pi2y = -0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137), 
    pi5y = -0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137), 
    dial_time = as.duration(assess_date %--% dial_date) / ddays(1), 
    death_time = as.duration(assess_date %--% death_date) / ddays(1), 
    dialc = case_when(
      dial == 0 | is.na(dial) ~ 0, 
      dial == 1 & dial_date <= as.Date("2022-12-31") ~ 1, 
      dial == 1 & dial_date > as.Date("2022-12-31") ~ 0,
      TRUE ~ as.numeric(NA)
    ), 
    ddialc = case_when(
      dial_date <= as.Date("2022-12-31") ~ dial_date, 
      dial_date > as.Date("2022-12-31") ~ as.Date("2022-12-31"),
      is.na(dial_date) & deathc == 1 ~ ddeathc, 
      is.na(dial_date) & deathc == 0 ~ as.Date("2022-12-31"), 
      TRUE ~ as.Date(NA)
    ), 
    tdeathc = as.duration(assess_date %--% ddeathc) / dyears(1), 
    tdialc = as.duration(assess_date %--% ddialc) / dyears(1), 
    status_num = case_when(
      dialc == 0 & deathc == 0 ~ 0, 
      dialc == 1 & deathc == 0 ~ 1, #< Evento de interes: dialisis
      dialc == 0 & deathc == 1 & death_time >= 0~ 2, #< Evento en competencia (muerte antes de dialisis)
      dialc == 1 & deathc == 1 & (tdialc <= tdeathc) ~ 1, 
      TRUE ~ as.numeric(NA)
    ), 
    status_num2 = factor(status_num, levels = c(0, 1, 2), 
                         labels = c("Alive w/o Kidney Failure", 
                                    "Kidney Failure", 
                                    "Death w/o Kidney Failure")), 
    time = case_when(
      status_num == 0 ~ tdialc, 
      status_num == 1 ~ tdialc, 
      status_num == 2 ~ tdeathc, 
      TRUE ~ as.numeric(NA)
    ), 
    status_num = as.integer(status_num), 
    grf_cat = case_when(
      eGFR_ckdepi > 90 ~ "G1", 
      eGFR_ckdepi >= 60 & eGFR_ckdepi <= 90 ~ "G2", 
      eGFR_ckdepi >= 45 & eGFR_ckdepi < 60 ~ "G3a", 
      eGFR_ckdepi >= 30 & eGFR_ckdepi < 45 ~ "G3b", 
      eGFR_ckdepi >= 15 & eGFR_ckdepi < 30 ~ "G4", 
      eGFR_ckdepi < 15 ~ "G5", 
      TRUE ~ as.character(NA)
    ), 
    acr2 = urine_album / urine_crea, 
    acr_cat = case_when(
      acr < 30 ~ "A1", 
      acr >= 30 & acr <= 300 ~ "A2", 
      acr > 300 ~ "A3",
      TRUE ~ as.character(NA)
    ), 
    ckd_class = case_when(
      grf_cat %in% c("G1", "G2") & acr_cat == "A1" ~ "Low risk", 
      (grf_cat %in% c("G3a") & acr_cat == "A1") | 
        (grf_cat %in% c("G1", "G2") & acr_cat == "A2") ~ "Moderately increased risk", 
      (grf_cat %in% c("G3b") & acr_cat == "A1") | 
        (grf_cat == "G3a" & acr_cat == "A2") | 
        (grf_cat %in% c("G1", "G2") & acr_cat == "A3") ~ "High risk", 
      (grf_cat %in% c("G4", "G5") & acr_cat == "A1") | 
        (grf_cat %in% c("G3b", "G4", "G5") & acr_cat == "A2") | 
        (grf_cat %in% c("G3a", "G3b", "G4", "G5") & acr_cat == "A3") ~ "Very high risk"
    ), 
    grf_cat = factor(grf_cat, levels = c("G1", "G2", "G3a", "G3b", "G4", "G5")), 
    acr_cat = factor(acr_cat, levels = c("A1", "A2", "A3")), 
    ckd_stage = case_when(
      grf_cat %in% c("G3a", "G3b", "G4") ~ "Stages 3-4", 
      grf_cat %in% c("G1", "G2", "G5") ~ "Stages 1-2 y 5"
    ), 
    ckd_stage = factor(ckd_stage, levels = c("Stages 1-2 y 5", "Stages 3-4")), 
    ckd_stage2 = case_when(
      grf_cat %in% c("G3b", "G4") ~ "Stages 3b-4", 
      grf_cat %in% c("G3a", "G5", "G1", "G2") ~ "Stages 1-3 y 5"
    ), 
    ckd_stage2 = factor(ckd_stage2, levels = c("Stages 1-3 y 5", "Stages 3b-4")), 
    ckd_class = factor(ckd_class, 
                       levels = c("Low risk", 
                                  "Moderately increased risk", 
                                  "High risk", 
                                  "Very high risk")), 
    ckd_class2 = case_when(
      ckd_class %in% c("Low risk", "Moderately increased risk", 
                       "High risk") ~ "Moderately/High risk", 
      ckd_class == "Very high risk" ~ "Very high risk", 
      TRUE ~ as.character(NA)
    ), 
    ckd_class2 = factor(ckd_class2, 
                        levels = c("Moderately/High risk", "Very high risk")), 
    across(where(is.factor), ~droplevels(.)), 
    total = 1, 
    # Censoring to 5 years----
    eventd = case_when(
      status_num2 == "Alive w/o Kidney Failure" ~ 0, 
      status_num2 == "Kidney Failure" ~ 1, 
      status_num2 == "Death w/o Kidney Failure" ~ 2, 
      TRUE ~ as.numeric(NA)
    ), 
    event = case_when(
      status_num2 %in% c("Alive w/o Kidney Failure", "Death w/o Kidney Failure") ~ 0, 
      status_num2 %in% c("Kidney Failure") ~ 1, 
      TRUE ~ as.numeric(NA)
    ),
    time_death5y = censor.time(time, deathc, time.cens = 5)$surv.time.cens, 
    death5y = censor.time(time, deathc, time.cens = 5)$surv.event.cens, 
    time_death2y = censor.time(time, deathc, time.cens = 2)$surv.time.cens, 
    death2y = censor.time(time, deathc, time.cens = 2)$surv.event.cens, 
    time5y = censor.time(time, event, time.cens = 5)$surv.time.cens, 
    event5y = censor.time(time, event, time.cens = 5)$surv.event.cens, 
    eventd5y = censor.time(time, eventd, time.cens = 5)$surv.event.cens, 
    eventd5ylab = case_when(
      eventd5y == 0 ~ "Alive w/o Kidney Failure", 
      eventd5y == 1 ~ "Kidney Failure", 
      eventd5y == 2 ~ "Death w/o Kidney Failure", 
      TRUE ~ as.character(NA)
    ), 
    time2y = censor.time(time, event, time.cens = 2)$surv.time.cens, 
    event2y = censor.time(time, event, time.cens = 2)$surv.event.cens, 
    eventd2y = censor.time(time, eventd, time.cens = 2)$surv.event.cens, 
    eventd2ylab = case_when(
      eventd2y == 0 ~ "Alive w/o Kidney Failure", 
      eventd2y == 1 ~ "Kidney Failure", 
      eventd2y == 2 ~ "Death w/o Kidney Failure", 
      TRUE ~ as.character(NA)
    )
  ) |> 
  set_variable_labels(
    cas = "Centro de atención de salud", 
    sex = "Sexo", 
    male = "Sexo, masculino", 
    age = "Edad (años)", 
    assess_date = "Fecha de evaluación", 
    crea = "Creatinina sérica (mg/dL)", 
    eGFR_ckdepi = "TFG usando CKD-EPI, ml/min/1.73m2", 
    acr = "Relación albúmina-creatinina, mg/g", 
    urine_album = "Albúmina en orina (mg/ml)", 
    urine_crea = "Creatinina en orina (mg/dl)", 
    death_date = "Fecha de defunción", 
    dial_date = "Fecha de hemodiálisis", 
    hta = "Hipertensión", 
    dm = "Diabetes Mellitus", 
    risk2y = "Riesgo pronosticado de fallo renal a 2 años", 
    risk5y = "Riesgo pronosticado de fallo renal a 5 años", 
    pi2y = "Índice pronóstico de fallo renal a 2 años", 
    pi5y = "Índice pronóstico de fallo renal a 5 años",     
    grf_cat = "Categorías de TFG", 
    acr_cat = "Categorías de albuminuria persistente", 
    ckd_class = "Clasificación CKD KDIGO", 
    ckd_class2 = "Clasificación CKD KDIGO",
    ckd_stage = "Etapas de CKD", 
    ckd_stage2 = "Etapas de CKD", 
    status_num = "Resultado", 
    status_num2 = "Resultado", 
    eventd5ylab = "Resultado a 5 años", 
    eventd2ylab = "Resultado a 2 años", 
    eventd5y = "Resultado a 5 años", 
    deathc = "Defunción",
    death5y = "Defunción a 5 años", 
    death2y = "Defunción a 2 años",
    time_death5y = "Tiempo hasta muerte a 5 años", 
    dialc = "Fallo renal", 
    total = "Total", 
    grf_cat = "Categorias de GFR", 
    ckd_class = "Clasificación de CKD KDIGO") 
export(data_pred_reba4, here("Data", "Tidy", "data_reba.rds"))

0.2.4 Datos del hospital kaelin

0.2.4.1 Importar y procesar datos

data_pred_kaelin <- import(here("Data", "Raw", "bd_predictores_kaelin_lima.xlsx")) |> 
  clean_names() 
Warning: Expecting numeric in C1724 / R1724C3: got '1247831 '
Warning: Coercing text to numeric in L1724 / R1724C12: '1.4'
Warning: Coercing text to numeric in N1724 / R1724C14: '121.2'
Warning: Coercing text to numeric in O1724 / R1724C15: '6.8'
Warning: Coercing text to numeric in L1728 / R1728C12: '1.06'
Warning: Coercing text to numeric in N1728 / R1728C14: '4.03'
Warning: Coercing text to numeric in L1729 / R1729C12: '1.63'
Warning: Coercing text to numeric in N1729 / R1729C14: '446.14'
Warning: Coercing text to numeric in L1730 / R1730C12: '3.91'
New names:
• `` -> `...7`
• `` -> `...8`
data_pred_kaelin |> 
  head(10)

0.2.4.2 Limpiar variables

data_pred_kaelin2 <- data_pred_kaelin |> 
  rename(sex = sexo, 
         age = edad, 
         auto_cor = autogenerado, 
         assess_date = fecha, 
         crea = creatinina, 
         urine_album = albuminuria, 
         urine_crea = crea_orina, 
         acr = rac) |> 
  mutate(sex = factor(sex, 
                      levels = c("F", "M"), 
                      labels = c("Femenino", "Masculino")), 
         hta = as.numeric(hta), 
         dm = as.numeric(dm), 
         assess_date = as.Date(assess_date),
         auto_cor = str_trim(auto_cor), 
         cas = "KAELIN", 
         eGFR_ckdepi = case_when(
           crea <= 0.7 & sex == "Femenino" ~ 144 * ((crea / 0.7) ^ (-0.329)) * (0.993 ^ (age)) * 1, 
           crea > 0.7 & sex == "Femenino" ~ 144 * ((crea / 0.7) ^ (-1.209)) * (0.993 ^ (age)) * 1, 
           crea <= 0.9 & sex == "Masculino" ~ 141 * ((crea / 0.9) ^ (-0.411)) * (0.993 ^ (age)) * 1, 
           crea > 0.9 & sex == "Masculino" ~ 141 * ((crea / 0.9) ^ (-1.209)) * (0.993 ^ (age)) * 1, 
           TRUE ~ as.numeric(NA)
         ), 
         cas_hosp = "Hosp. Kaelin", 
         dni = dni_completo,
         dial_date = as.Date(fecha_inicio_hd)) |> 
  select(auto_cor, dni, sex, age, hta, dm, cas, cas_hosp, assess_date, crea, eGFR_ckdepi, urine_album, urine_crea, acr, dial_date)

0.2.4.3 Fusionar datos de muerte de sinadef

data_fallec_sinadef <- import(here("Data", "Raw", "sinadef2017_2023.csv")) |> 
  clean_names() |> 
  select(tipo_doc, documento, fallecido, fech_fallecimiento) |> 
  mutate(fuente = "data_fallec_sinadef") |> 
  filter(tipo_doc == "DNI/LE")

head(data_fallec_sinadef, 10)
data_pred_kaelin3 <- data_pred_kaelin2 |> 
  left_join(data_fallec_sinadef, by = c("dni" = "documento"))
data_pred_kaelin3 |> 
  count(fuente)
data_pred_kaelin3 |> 
  count(fallecido)

0.2.4.4 Inspeccionar datos perdidos

gg_miss_upset(data_pred_kaelin3)

0.2.4.5 Fusionar datos de dialisis y mortalidad con datos de Rebagliati

data_pred_kaelin3 <- data_pred_kaelin3  |> 
  mutate(death_date = as.Date(fech_fallecimiento)) |> 
  mutate(death = case_when(!is.na(death_date) ~ 1, 
                           is.na(death_date) ~ 0, 
                           TRUE ~ 0), 
         death_date = case_when(is.na(death_date ) ~ dmy("04-05-2023"), 
                                TRUE ~ death_date), 
         deathc = case_when(
           death == 0 | is.na(death) ~ 0, 
           death == 1 & death_date <= as.Date("2022-12-31") ~ 1, 
           death == 1 & death_date > as.Date("2022-12-31") ~ 0, 
           TRUE ~ as.numeric(NA)
         ),
         ddeathc = case_when(
           death_date <= as.Date("2022-12-31") ~ death_date, 
           death_date > as.Date("2022-12-31") | is.na(death_date) ~ as.Date("2022-12-31"),
           TRUE ~ as.Date(NA)
         ), 
         dial = case_when(!is.na(dial_date) ~ 1, 
                          is.na(dial_date) ~ 0, 
                          TRUE ~ as.numeric(NA)), 
         dial_date = case_when(!is.na(dial_date) ~ dial_date,
                               is.na(dial_date) ~ dmy("07-09-2023")))
data_pred_kaelin3  |> 
  head(10)
data_pred_kaelin4 <- data_pred_kaelin3 |> 
  mutate(
    male = if_else(sex == "Masculino", 1, 0), 
    risk2y = 1 - 0.9832 ^ exp(-0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137)), 
    risk5y = 1 - 0.9365 ^ exp(-0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137)), 
    pi2y = -0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137), 
    pi5y = -0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137), 
    dial_time = as.duration(assess_date %--% dial_date) / ddays(1), 
    death_time = as.duration(assess_date %--% death_date) / ddays(1), 
    dialc = case_when(
      dial == 0 | is.na(dial) ~ 0, 
      dial == 1 & dial_date <= as.Date("2022-12-31") ~ 1, 
      dial == 1 & dial_date > as.Date("2022-12-31") ~ 0,
      TRUE ~ as.numeric(NA)
    ), 
    ddialc = case_when(
      dial_date <= as.Date("2022-12-31") ~ dial_date, 
      dial_date > as.Date("2022-12-31") ~ as.Date("2022-12-31"),
      is.na(dial_date) & deathc == 1 ~ ddeathc, 
      is.na(dial_date) & deathc == 0 ~ as.Date("2022-12-31"), 
      TRUE ~ as.Date(NA)
    ), 
    tdeathc = as.duration(assess_date %--% ddeathc) / dyears(1), 
    tdialc = as.duration(assess_date %--% ddialc) / dyears(1), 
    status_num = case_when(
      dialc == 0 & deathc == 0 ~ 0, 
      dialc == 1 & deathc == 0 ~ 1, #< Evento de interes: dialisis
      dialc == 0 & deathc == 1 & death_time >= 0~ 2, #< Evento en competencia (muerte antes de dialisis)
      dialc == 1 & deathc == 1 & (tdialc <= tdeathc) ~ 1, 
      TRUE ~ as.numeric(NA)
    ), 
    status_num2 = factor(status_num, levels = c(0, 1, 2), 
                         labels = c("Alive w/o Kidney Failure", 
                                    "Kidney Failure", 
                                    "Death w/o Kidney Failure")), 
    time = case_when(
      status_num == 0 ~ tdialc, 
      status_num == 1 ~ tdialc, 
      status_num == 2 ~ tdeathc, 
      TRUE ~ as.numeric(NA)
    ), 
    status_num = as.integer(status_num), 
    grf_cat = case_when(
      eGFR_ckdepi > 90 ~ "G1", 
      eGFR_ckdepi >= 60 & eGFR_ckdepi <= 90 ~ "G2", 
      eGFR_ckdepi >= 45 & eGFR_ckdepi < 60 ~ "G3a", 
      eGFR_ckdepi >= 30 & eGFR_ckdepi < 45 ~ "G3b", 
      eGFR_ckdepi >= 15 & eGFR_ckdepi < 30 ~ "G4", 
      eGFR_ckdepi < 15 ~ "G5", 
      TRUE ~ as.character(NA)
    ), 
    acr2 = urine_album / urine_crea, 
    acr_cat = case_when(
      acr < 30 ~ "A1", 
      acr >= 30 & acr <= 300 ~ "A2", 
      acr > 300 ~ "A3",
      TRUE ~ as.character(NA)
    ), 
    ckd_class = case_when(
      grf_cat %in% c("G1", "G2") & acr_cat == "A1" ~ "Low risk", 
      (grf_cat %in% c("G3a") & acr_cat == "A1") | 
        (grf_cat %in% c("G1", "G2") & acr_cat == "A2") ~ "Moderately increased risk", 
      (grf_cat %in% c("G3b") & acr_cat == "A1") | 
        (grf_cat == "G3a" & acr_cat == "A2") | 
        (grf_cat %in% c("G1", "G2") & acr_cat == "A3") ~ "High risk", 
      (grf_cat %in% c("G4", "G5") & acr_cat == "A1") | 
        (grf_cat %in% c("G3b", "G4", "G5") & acr_cat == "A2") | 
        (grf_cat %in% c("G3a", "G3b", "G4", "G5") & acr_cat == "A3") ~ "Very high risk"
    ), 
    grf_cat = factor(grf_cat, levels = c("G1", "G2", "G3a", "G3b", "G4", "G5")), 
    acr_cat = factor(acr_cat, levels = c("A1", "A2", "A3")), 
    ckd_stage = case_when(
      grf_cat %in% c("G3a", "G3b", "G4") ~ "Stages 3-4", 
      grf_cat %in% c("G1", "G2", "G5") ~ "Stages 1-2 y 5"
    ), 
    ckd_stage = factor(ckd_stage, levels = c("Stages 1-2 y 5", "Stages 3-4")), 
    ckd_stage2 = case_when(
      grf_cat %in% c("G3b", "G4") ~ "Stages 3b-4", 
      grf_cat %in% c("G3a", "G5", "G1", "G2") ~ "Stages 1-3 y 5"
    ), 
    ckd_stage2 = factor(ckd_stage2, levels = c("Stages 1-3 y 5", "Stages 3b-4")), 
    ckd_class = factor(ckd_class, 
                       levels = c("Low risk", 
                                  "Moderately increased risk", 
                                  "High risk", 
                                  "Very high risk")), 
    ckd_class2 = case_when(
      ckd_class %in% c("Low risk", "Moderately increased risk", 
                       "High risk") ~ "Moderately/High risk", 
      ckd_class == "Very high risk" ~ "Very high risk", 
      TRUE ~ as.character(NA)
    ), 
    ckd_class2 = factor(ckd_class2, 
                        levels = c("Moderately/High risk", "Very high risk")), 
    across(where(is.factor), ~droplevels(.)), 
    total = 1, 
    # Censoring to 5 years----
    eventd = case_when(
      status_num2 == "Alive w/o Kidney Failure" ~ 0, 
      status_num2 == "Kidney Failure" ~ 1, 
      status_num2 == "Death w/o Kidney Failure" ~ 2, 
      TRUE ~ as.numeric(NA)
    ), 
    event = case_when(
      status_num2 %in% c("Alive w/o Kidney Failure", "Death w/o Kidney Failure") ~ 0, 
      status_num2 %in% c("Kidney Failure") ~ 1, 
      TRUE ~ as.numeric(NA)
    ),
    time_death5y = censor.time(time, deathc, time.cens = 5)$surv.time.cens, 
    death5y = censor.time(time, deathc, time.cens = 5)$surv.event.cens, 
    time_death2y = censor.time(time, deathc, time.cens = 2)$surv.time.cens, 
    death2y = censor.time(time, deathc, time.cens = 2)$surv.event.cens, 
    time5y = censor.time(time, event, time.cens = 5)$surv.time.cens, 
    event5y = censor.time(time, event, time.cens = 5)$surv.event.cens, 
    eventd5y = censor.time(time, eventd, time.cens = 5)$surv.event.cens, 
    eventd5ylab = case_when(
      eventd5y == 0 ~ "Alive w/o Kidney Failure", 
      eventd5y == 1 ~ "Kidney Failure", 
      eventd5y == 2 ~ "Death w/o Kidney Failure", 
      TRUE ~ as.character(NA)
    ), 
    time2y = censor.time(time, event, time.cens = 2)$surv.time.cens, 
    event2y = censor.time(time, event, time.cens = 2)$surv.event.cens, 
    eventd2y = censor.time(time, eventd, time.cens = 2)$surv.event.cens, 
    eventd2ylab = case_when(
      eventd2y == 0 ~ "Alive w/o Kidney Failure", 
      eventd2y == 1 ~ "Kidney Failure", 
      eventd2y == 2 ~ "Death w/o Kidney Failure", 
      TRUE ~ as.character(NA)
    )
  ) |> 
  set_variable_labels(
    cas = "Centro de atención de salud", 
    sex = "Sexo", 
    male = "Sexo, masculino", 
    age = "Edad (años)", 
    assess_date = "Fecha de evaluación", 
    crea = "Creatinina sérica (mg/dL)", 
    eGFR_ckdepi = "TFG usando CKD-EPI, ml/min/1.73m2", 
    acr = "Relación albúmina-creatinina, mg/g", 
    urine_album = "Albúmina en orina (mg/ml)", 
    urine_crea = "Creatinina en orina (mg/dl)", 
    death_date = "Fecha de defunción", 
    dial_date = "Fecha de hemodiálisis", 
    hta = "Hipertensión", 
    dm = "Diabetes Mellitus", 
    risk2y = "Riesgo pronosticado de fallo renal a 2 años", 
    risk5y = "Riesgo pronosticado de fallo renal a 5 años", 
    pi2y = "Índice pronóstico de fallo renal a 2 años", 
    pi5y = "Índice pronóstico de fallo renal a 5 años",     
    grf_cat = "Categorías de TFG", 
    acr_cat = "Categorías de albuminuria persistente", 
    ckd_class = "Clasificación CKD KDIGO", 
    ckd_class2 = "Clasificación CKD KDIGO",
    ckd_stage = "Etapas de CKD", 
    ckd_stage2 = "Etapas de CKD", 
    status_num = "Resultado", 
    status_num2 = "Resultado", 
    eventd5ylab = "Resultado a 5 años", 
    eventd2ylab = "Resultado a 2 años", 
    eventd5y = "Resultado a 5 años", 
    deathc = "Defunción",
    death5y = "Defunción a 5 años", 
    death2y = "Defunción a 2 años",
    time_death5y = "Tiempo hasta muerte a 5 años", 
    dialc = "Fallo renal", 
    total = "Total", 
    grf_cat = "Categorias de GFR", 
    ckd_class = "Clasificación de CKD KDIGO") 
export(data_pred_kaelin4, here("Data", "Tidy", "data_kaelin.rds"))

0.2.5 Datos de predictores y mortalidad a nivel NACIONAL

Abrimos la data de predictores, la cual corresponde a la última versión de los datos actualizados al 11 de setiembre de 2023.

data_pred_nac <- import(here("Data", 
                             "Raw", 
                             "02082023_bd_predictores_fallec.csv")) |> 
  clean_names()

head(data_pred_nac, 10)

0.2.5.1 Fusionar con datos de dialisis

Lo primero será fusion los datos con hemodialisis:

data_pred_nac <- data_pred_nac |> 
  mutate(auto_cor = str_trim(auto_cor)) |> 
  left_join(data_dial2, by = "auto_cor")
data_pred_nac |> 
  count(dial)

Convertimos la variable indicadora dialisis en formato 1/0:

data_pred_nac <- data_pred_nac |> 
  mutate(dial = case_when(!is.na(dial) ~ dial, 
                          is.na(dial) ~ 0, 
                          TRUE ~ as.numeric(NA)), 
         dial_date = case_when(!is.na(dial_date) ~ dial_date,
                               is.na(dial_date) ~ dmy("07-09-2023")))
data_pred_nac |> 
  count(dial)

0.2.5.2 Deidentificacion

Nos quedamos solo con el autogenerado (auto_cor) y eliminamos los apellido y el dni:

data_pred_nac <- data_pred_nac |> 
  select(-dni, -apellid)

head(data_pred_nac, 10)

0.2.5.3 Limpieza de columnas de interes

Eliminamos columnas innecesarias:

data_pred_nac <- data_pred_nac |> 
  select(-ruta)

head(data_pred_nac, 10)

A continuación vamos a limpiar las columnas de interés:

0.2.5.3.1 Creatinina sérica
  • Armonizando la columna creat_mg_percent:
data_pred_nac |> 
  count(creat_mg_percent) 
data_pred_nac <- data_pred_nac |> 
  mutate(creat_ser_unid = case_match(creat_mg_percent, 
                                     "" ~ "No info", 
                                     c("CREASER (mg%)",  
                                       "CREASER(mg%)", 
                                       "CREAT (mg%)",  
                                       "CREATININA  mg %", 
                                       "CREATININA  mg%", 
                                       "CREATININA  mg/dl", 
                                       "Creatinina (mg%)", 
                                       "CREATININA (mg%)", 
                                       "Creatinina en sangre (mg%)", 
                                       "CREATININA mg %", 
                                       "Creatinina mg%", 
                                       "CREATININA mg%", "CREATININA mg.", 
                                       "Creatinina sangre (mg%)", 
                                       "Creatinina Sangre (mg%)", 
                                       "CREATININA SANGRE (mg%)", 
                                       "CREATININA SANGRE (MG%)", 
                                       "Creatinina Sangre (mg/dl)", 
                                       "CREATININA SERICA  (mg%)", 
                                       "CREATININA SÉRICA mg%", 
                                       "CREATININA SÉRICA mg/dl", 
                                       "CREATININA SERICA(mg%)", 
                                       "CREATININA%", 
                                       "CREATININA(MG%)", 
                                       "Creatinina\nmg%", 
                                       "CREATININA\nmg%") ~ "mg/dl", 
                                     c("CREATININA", "CREATININA SERICA") ~ "No unidades"
                                     ))

data_pred_nac |> 
  count(creat_ser_unid)

Como se puede apreciar de arriba, no sabemos las unidades de algunos resultados de creatinina. Una forma de considerar mantener estos datos o eliminarlos es evaluar si el rango de sus valores cae dentro de lo esperado para esta variable. Por tal motivo, compararemos la distrbución de estos valores.

data_pred_nac |> 
  ggplot(aes(x = creat_ser_unid, y = creatinina_cor)) + 
  geom_boxplot()
Warning: Removed 1034 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Observamos que en la categoría “No info” simplemente no hay valores (todos son perdidos), mientras que en la categoría “No unidades” la dispersión de valores cae dentro del rango aceptable para creatinina sérica. Por tal motivo, decidimos que es razonable concluir que sí sean valores de creatinina en sus unidades original y procederemos a mantenerlos.

data_pred_nac <- data_pred_nac |> 
  mutate(creat_ser_unid = case_match(creat_ser_unid, 
                                     "No info" ~ NA, 
                                     "mg/dl" ~ "mg/dl", 
                                     "No unidades" ~ "mg/dl"
                                     ))

data_pred_nac |> 
  count(creat_ser_unid)
Hmisc::describe(data_pred_nac$creatinina_cor)
data_pred_nac$creatinina_cor 
       n  missing distinct     Info     Mean      Gmd      .05      .10 
  268641     1034     1113    0.999   0.9705   0.3762     0.57     0.62 
     .25      .50      .75      .90      .95 
    0.74     0.90     1.08     1.30     1.53 

lowest : 0.105   0.109   0.11    0.12    0.122  
highest: 18.2    18.2711 18.3    19.2    19.6682

Vamos a conservar todos los valores de creatinina y la guardaremso en una nueva variable llamada crea:

data_pred_nac <- data_pred_nac |> 
  rename(crea = creatinina_cor)

1

1.0.0.0.1 Creatininuria
  • Exploracion inicial de la variable:
data_pred_nac |> 
  count(creatinuria_cor)
data_pred_nac |> 
  count(desc_creatinuria)

Se aprecia que casi todos son datos perdidos, sin embargo, estos datos pueden recuperarse de dos maneras, en ambos casos haciendo uso de las columnas tasa_ac_cor y desc_tasa_ac. Como se puede apreciar, la columna desc_tasa_ac indica si el valor de tasa_ac_cor es el de la razón albuminuria-creatininuria o es el de la creatinuria a secas:

data_pred_nac |> 
  count(desc_tasa_ac)

Vamos a usar esta información para corregir los valores de creatinuria_cor. Primero vamos a simplificar los valores de la variable desc_tasa_ac:

data_pred_nac <- data_pred_nac |> 
  mutate(desc_tasa_ac = case_match(desc_tasa_ac, 
                                    "" ~ "No info", 
                                    c("CREATININA EN ORINA", 
                                      "CREATINU(mg/dl)", 
                                      "CREATINUIRA(mg/dl)", 
                                      "creatinuria",
                                      "Creatinuria", 
                                      "CREATINURIA", 
                                      "Creatinuria  Aclaramiento(mg/dl)", 
                                      "Creatinuria (G/D)", 
                                      "CREATINURIA (mg %)", 
                                      "Creatinuria (mg%)", 
                                      "CREATINURIA (MG%)", 
                                      "Creatinuria (mg/dl)", 
                                      "CREATINURIA (mg/dl)", 
                                      "CREATINURIA (mg/L)", 
                                      "Creatinuria (mgl /dl)", 
                                      "CREATINURIA(mg/dl)") ~ "Creatinuria (mg/dL)", 
                                    c("Tasa  (mg/g)", 
                                      "TASA ALB/CRE", 
                                      "TASA ALB/CREA", 
                                      "Tasa alb/creat.", 
                                      "Tasa Album. / Creatinina (mg/g)", 
                                      "Tasa Albúmina / Creatinina (mg/g)", 
                                      "Tasa Albúmina / Creatinina\n(mg/g)", 
                                      "TASA ALBUMINA /CREATININA mg/g", 
                                      "Tasa Albúmina Creatinina mg/g", 
                                      "Tasa Albúmina/Creati (mg/g)", 
                                      "Tasa albumina/Creatinina (mg/g)", 
                                      "TASA ALBUMINA/CREATININA (mg/g)", 
                                      "Tasa Albúmina/Creatinina (mg/g)", 
                                      "Tasa Albúmina/Creatinina mg/g", 
                                      "TASA ALBÚMINA/CREATININA MG/G",
                                      "Tasa Albúmina/Creatinuria mg/g", 
                                      "Tasa Albuminuria / Creatinuria (mg/g)",
                                      "Tasa Albúminuria/Creatininuria (mg/g)", 
                                      "Tasa Albúminuria/Creatinuria mg/g", 
                                      "Tasa Albuminuria/Creatinuria(mg/g)") ~ "Relación Albuminuria/Creatinuria", 
                                    c("Dep.creat(ml/min)") ~ "Otros"))

data_pred_nac |> 
  count(desc_tasa_ac)

Ahora verificaremos qué rangos de valores manejan cada una de estas categorías:

data_pred_nac |> 
  ggplot(aes(x = desc_tasa_ac, y = tasa_ac_cor)) + 
  geom_boxplot()
Warning: Removed 182062 rows containing non-finite outside the scale range
(`stat_boxplot()`).

stby(data = data_pred_nac, 
     INDICES = data_pred_nac$desc_tasa_ac, 
     FUN = descr, 
     stats = "common", 
     transpose = TRUE)
Non-numerical variable(s) ignored: auto_cor, fn, edad, f_ingreso, creat_mg_percent, desc_creatinuria, albuminuria, desc_albuminuria, albuminemia, desc_albuminemia, tfg, desc_tfg, tasa_ac, desc_tasa_ac, region, fallece, dial_date, diag_cie, creat_ser_unid
Descriptive Statistics  
data_pred_nac  
Group: desc_tasa_ac = Creatinuria (mg/dL)  
N: 156871  

                            Mean      Std.Dev       Min    Median            Max     N.Valid   Pct.Valid
--------------------- ---------- ------------ --------- --------- -------------- ----------- -----------
      albuminemia_cor     468.53      7635.52      0.00      4.23      132207.03     4056.00        2.59
      albuminuria_cor     474.14      7671.38      0.00      4.39      132096.40      582.00        0.37
                 anio    2017.32         2.48   2013.00   2018.00        2022.00   156871.00      100.00
                 crea       0.97         0.53      0.10      0.89          19.67   156332.00       99.66
           creatinina       0.97         0.53      0.10      0.89          19.67   156332.00       99.66
          creatinuria         NA           NA        NA        NA             NA        0.00          NA
      creatinuria_cor         NA           NA        NA        NA             NA        0.00          NA
                 dial       0.01         0.11      0.00      0.00           1.00   156871.00      100.00
                   dm       0.18         0.39      0.00      0.00           1.00   156871.00      100.00
                  hta       0.41         0.49      0.00      0.00           1.00   156871.00      100.00
               hta_dm       0.12         0.33      0.00      0.00           1.00   156871.00      100.00
                 sexo       0.47         0.50      0.00      0.00           1.00   156871.00      100.00
          tasa_ac_cor   13343.48   3532199.98      0.00     42.48   938940931.00    70662.00       45.04
              tfg_cor      83.77        99.92      0.21     79.29       25704.00   133861.00       85.33

Group: desc_tasa_ac = No info  
N: 88956  

                           Mean    Std.Dev       Min    Median         Max    N.Valid   Pct.Valid
--------------------- --------- ---------- --------- --------- ----------- ---------- -----------
      albuminemia_cor   3153.30   19924.98      0.04      4.36   130726.77     541.00        0.61
      albuminuria_cor   1942.49   15778.24      0.01      4.40   130421.57     135.00        0.15
                 anio   2016.39       2.51   2013.00   2016.00     2022.00   88956.00      100.00
                 crea      0.98       0.54      0.11      0.90       18.20   88599.00       99.60
           creatinina      0.98       0.54      0.11      0.90       18.20   88599.00       99.60
          creatinuria     38.07      33.15      5.70     16.30       89.58      15.00        0.02
      creatinuria_cor     38.07      33.15      5.70     16.30       89.58      15.00        0.02
                 dial      0.01       0.12      0.00      0.00        1.00   88956.00      100.00
                   dm      0.24       0.42      0.00      0.00        1.00   88956.00      100.00
                  hta      0.52       0.50      0.00      1.00        1.00   88956.00      100.00
               hta_dm      0.06       0.24      0.00      0.00        1.00   88956.00      100.00
                 sexo      0.44       0.50      0.00      0.00        1.00   88956.00      100.00
          tasa_ac_cor        NA         NA        NA        NA          NA       0.00          NA
              tfg_cor     84.54     269.88      0.23     78.20    68050.00   65132.00       73.22

Group: desc_tasa_ac = Otros  
N: 43  

                           Mean   Std.Dev       Min    Median       Max   N.Valid   Pct.Valid
--------------------- --------- --------- --------- --------- --------- --------- -----------
      albuminemia_cor        NA        NA        NA        NA        NA      0.00          NA
      albuminuria_cor        NA        NA        NA        NA        NA      0.00          NA
                 anio   2019.00      0.00   2019.00   2019.00   2019.00     43.00      100.00
                 crea      0.90      0.28      0.54      0.82      1.67     43.00      100.00
           creatinina      0.90      0.28      0.54      0.82      1.67     43.00      100.00
          creatinuria        NA        NA        NA        NA        NA      0.00          NA
      creatinuria_cor        NA        NA        NA        NA        NA      0.00          NA
                 dial      0.00      0.00      0.00      0.00      0.00     43.00      100.00
                   dm      0.19      0.39      0.00      0.00      1.00     43.00      100.00
                  hta      0.67      0.47      0.00      1.00      1.00     43.00      100.00
               hta_dm      0.14      0.35      0.00      0.00      1.00     43.00      100.00
                 sexo      0.47      0.50      0.00      0.00      1.00     43.00      100.00
          tasa_ac_cor     77.24     22.12     29.74     77.53    116.93     43.00      100.00
              tfg_cor        NA        NA        NA        NA        NA      0.00          NA

Group: desc_tasa_ac = Relación Albuminuria/Creatinuria  
N: 23805  

                           Mean    Std.Dev       Min    Median         Max    N.Valid   Pct.Valid
--------------------- --------- ---------- --------- --------- ----------- ---------- -----------
      albuminemia_cor   2695.86   18485.72      0.20      4.23   134556.94     343.00        1.44
      albuminuria_cor   1908.84   15338.64      0.03      4.40   130177.97      72.00        0.30
                 anio   2016.72       2.68   2013.00   2016.00     2022.00   23805.00      100.00
                 crea      0.98       0.46      0.17      0.90       16.70   23667.00       99.42
           creatinina      0.98       0.46      0.17      0.90       16.70   23667.00       99.42
          creatinuria        NA         NA        NA        NA          NA       0.00          NA
      creatinuria_cor        NA         NA        NA        NA          NA       0.00          NA
                 dial      0.01       0.08      0.00      0.00        1.00   23805.00      100.00
                   dm      0.18       0.39      0.00      0.00        1.00   23805.00      100.00
                  hta      0.43       0.50      0.00      0.00        1.00   23805.00      100.00
               hta_dm      0.26       0.44      0.00      0.00        1.00   23805.00      100.00
                 sexo      0.50       0.50      0.00      1.00        1.00   23805.00      100.00
          tasa_ac_cor   1201.85   14201.39      0.00     32.57   504864.86   16908.00       71.03
              tfg_cor     83.34      68.16      0.71     77.40     2014.00   20352.00       85.49

Se aprecia que para la categoría de creatinuria hay un valor excesivamente alto que supera los billone. Valores en este rango son definitivamente no plausibles por lo que procederemos a eliminarlos:

data_pred_nac <- data_pred_nac |> 
  mutate(tasa_ac_cor = if_else(tasa_ac_cor > 9e8, NA, tasa_ac_cor))

Ahora volveremos a verificar qué rangos de valores manejan cada una de estas categorías:

data_pred_nac |> 
  ggplot(aes(x = desc_tasa_ac, y = tasa_ac_cor)) + 
  geom_boxplot()
Warning: Removed 182063 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Como era de esperarse, ahora solo la relación albuminuiria/creatinuria maneja rangos muy grandes. Sin embargo, por experiencia previa de análisi de datos de esta variable en nuestro equipo de investigación, hemos podido apreciar valores tan altos como 150 mil. Creemos que, por ser una variable de razón, es posible tener valores inusualmente altos. Por tal motivo, mantendremos estos valores hasta el final del procesamiento y, posteriormente, veremos qué decisión tomar para su manejo.

Ahora veamos mejor el rango de valores analizando solo creatinuria vs otros.

data_pred_nac |> 
  filter(desc_tasa_ac != "Relación Albuminuria/Creatinuria") |> 
  ggplot(aes(x = desc_tasa_ac, y = tasa_ac_cor)) + 
  geom_boxplot()
Warning: Removed 175166 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Procedemos a crear la variable creatinuria

data_pred_nac <- data_pred_nac |> 
  mutate(urine_crea = case_when(desc_tasa_ac == "Creatinuria (mg/dL)" & 
                                  is.na(creatinuria_cor) ~ tasa_ac_cor, 
                                desc_tasa_ac == "Creatinuria (mg/dL)" & 
                                  !is.na(creatinuria_cor) ~ creatinuria_cor, 
                                TRUE ~ as.numeric(NA)))
data_pred_nac |> 
  filter(desc_tasa_ac != "Relación Albuminuria/Creatinuria") |> 
  ggplot(aes(x = desc_tasa_ac, y = urine_crea)) + 
  geom_boxplot()
Warning: Removed 175209 rows containing non-finite outside the scale range
(`stat_boxplot()`).

1.0.0.0.2 Albuminuria

Con esta variable tenemos el problema de que hay dos columnas que podrían contener valores. Vamos a limpiar cada columna por separdo y luego integrarlos en una sola variable.

data_pred_nac |> 
  count(desc_albuminuria)
data_pred_nac <- data_pred_nac |> 
  mutate(desc_albuminuria2 = case_match(desc_albuminuria, 
                                        "" ~ "No info", 
                                        c("ALBUM  (mg/dl)", 
                                          "ALBUM(g/dl)", 
                                          "ALBUMINA", 
                                          "ALBUMINA  (g/dl)", 
                                          "ALBUMINA  (mg/dl)", 
                                          "ALBUMINA (g/dl)", 
                                          "albuminuria", 
                                          "ALBUMINURIA", 
                                          "ALBUMINURIA (mg %)", 
                                          "Albuminuria (mg%)", 
                                          "ALBUMINURIA (MG%)", 
                                          "Albuminuria (mg%) microalbuminuria", 
                                          "Albuminuria (mg/dl)", 
                                          "Albuminuria (mg/dL)", 
                                          "ALBUMINURIA (mg/dl)", 
                                          "Albuminuria(mg%)", 
                                          "Albuminuria(mg/dl)", 
                                          "ALBUMINURIA(mg/dl)") ~ "Albuminuria", 
                                        c("ALBUM  >3,5", 
                                          "ALBUM>3.5", 
                                          "ALBUMINA > 3.5", 
                                          "ALBÚMINA >3",                                           
                                          "ALBÚMINA >3,5", 
                                          "ALBUMINA >3.5") ~ "Albumin(?)", 
                                        c("Micro albuminuria", 
                                          "MICRO ALBUMINURIA", 
                                          "Micro Albuminuria (mg%)", 
                                          "MICROALBUMINURIA", 
                                          "Mircoalbuminuria en tira reactiva") ~ 
                                          "Microalbuminuria"))
data_pred_nac |> 
  count(desc_albuminuria2)
stby(data = data_pred_nac %>% select(desc_albuminuria2, albuminuria_cor), 
     INDICES = data_pred_nac$desc_albuminuria2, 
     FUN = descr, 
     stats = "common", 
     transpose = TRUE)
Non-numerical variable(s) ignored: desc_albuminuria2
Descriptive Statistics  
albuminuria_cor by desc_albuminuria2  
Data Frame: data_pred_nac  
N: 4537  

                            Mean    Std.Dev     Min   Median         Max   N.Valid   Pct.Valid
---------------------- --------- ---------- ------- -------- ----------- --------- -----------
            Albumin(?)     80.88      82.23   21.00    50.00      202.50      4.00        0.09
           Albuminuria    738.04    9587.62    0.00     4.40   132096.40    738.00        3.97
      Microalbuminuria      7.18      10.10    0.28     0.35       22.70      5.00        0.95
               No info   3109.28   20123.85    0.70     4.26   130421.57     42.00        0.02

Se aprecian valores inusualmente altos de albuminuria, los cuales consideraremos como dato perdido por ser implausibles:

data_pred_nac |> 
  ggplot(aes(x = desc_albuminuria2, y = albuminuria_cor)) + 
  geom_boxplot()
Warning: Removed 268886 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Convertimos a NA a los valores >1e5 y volvemos a verificar:

data_pred_nac <- data_pred_nac |> 
  mutate(albuminuria_cor = if_else(albuminuria_cor > 1e5, NA, albuminuria_cor))
data_pred_nac |> 
  ggplot(aes(x = desc_albuminuria2, y = albuminuria_cor)) + 
  geom_boxplot()
Warning: Removed 268891 rows containing non-finite outside the scale range
(`stat_boxplot()`).

stby(data = data_pred_nac %>% select(desc_albuminuria2, albuminuria_cor), 
     INDICES = data_pred_nac$desc_albuminuria2, 
     FUN = descr, 
     stats = "common", 
     transpose = TRUE)
Non-numerical variable(s) ignored: desc_albuminuria2
Descriptive Statistics  
albuminuria_cor by desc_albuminuria2  
Data Frame: data_pred_nac  
N: 4537  

                          Mean   Std.Dev     Min   Median       Max   N.Valid   Pct.Valid
---------------------- ------- --------- ------- -------- --------- --------- -----------
            Albumin(?)   80.88     82.23   21.00    50.00    202.50      4.00        0.09
           Albuminuria   30.89    179.81    0.00     4.40   3390.00    734.00        3.95
      Microalbuminuria    7.18     10.10    0.28     0.35     22.70      5.00        0.95
               No info    4.10      0.73    0.70     4.22      5.21     41.00        0.02

Por último, hemos vemos que hay muy pocos individuos con Albumin(?), con microalbuminuria y sin Información. Debido a que no estamos seguros qué valores son y la proporción de etos es insignificante, decidimos convertir en dato perdido estos valores.

data_pred_nac <- data_pred_nac |> 
  mutate(pre_urine_album1 = if_else(desc_albuminuria2 == "Albuminuria", albuminuria_cor, NA))
data_pred_nac |> 
  ggplot(aes(x = desc_albuminuria2, y = pre_urine_album1)) + 
  geom_boxplot()
Warning: Removed 268941 rows containing non-finite outside the scale range
(`stat_boxplot()`).

1.0.0.0.3 Albuminemia

Con esta variable tenemos el problema de que hay dos columnas que podrían contener valores. Vamos a limpiar cada columna por separdo y luego integrarlos en una sola variable.

data_pred_nac |> 
  count(desc_albuminemia)
data_pred_nac <- data_pred_nac |> 
  mutate(desc_albuminemia2 = case_match(desc_albuminemia, 
                                        "" ~ "No info", 
                                        c("ALBUMINURIA", 
                                          "ALBUMINURIA (mg %)", 
                                          "Albuminuria (mg%)", 
                                          "ALBUMINURIA (MG%)", 
                                          "Albuminuria (mg/dl)", 
                                          "Albuminuria (mg/L)") ~ "Albuminuria", 
                                        c("ALBUM", 
                                          "ALBUM  (mg/dl)", 
                                          "ALBUM  >3,5", 
                                          "ALBUM >3,5", 
                                          "ALBUM(g/dl)", 
                                          "ALBUM>3.5", 
                                          "ALBUMINA", 
                                          "ALBUMINA  (g/dl)", 
                                          "ALBUMINA  (mg/dl)", 
                                          "ALBUMINA  >3,5", 
                                          "ALBUMINA (g/dl)", 
                                          "ALBUMINA >3.5") ~ "Albumin(?)", 
                                        c("ALBUMINA  (mg/dl)-SERICA") ~ "Albuminemia", 
                                        c("Examenes deñl 21 de Nov. 2011, no tiene microalbuminuria", 
                                          "Micro albuminuria", "MICRO ALBUMINURIA", 
                                          "MICROALBUMINURIA", 
                                          "Mircoalbuminuria en tira reactiva", 
                                          "MOCROALBUMINURIA EN TIRA REACTIVA", 
                                          "SI ALBUMINA CORREGIDA(1)") ~ 
                                          "Microalbuminuria"))
data_pred_nac |> 
  ggplot(aes(x = desc_albuminemia2, y = albuminemia_cor)) + 
  geom_boxplot()
Warning: Removed 264735 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Convertimos a NA a los valores >1e5 y volvemos a verificar:

data_pred_nac <- data_pred_nac |> 
  mutate(albuminemia_cor = if_else(albuminemia_cor > 1e5, NA, albuminemia_cor))
data_pred_nac |> 
  ggplot(aes(x = desc_albuminemia2, y = albuminemia_cor)) + 
  geom_boxplot()
Warning: Removed 264769 rows containing non-finite outside the scale range
(`stat_boxplot()`).

stby(data = data_pred_nac %>% select(desc_albuminemia2, albuminemia_cor), 
     INDICES = data_pred_nac$desc_albuminemia2, 
     FUN = descr, 
     stats = "common", 
     transpose = TRUE)
Warning: There were 2 warnings in `summarise()`.
The first warning was:
ℹ In argument: `min = (structure(function (..., .x = ..1, .y = ..2, . = ..1)
  ...`.
Caused by warning in `min()`:
! ningún argumento finito para min; retornando Inf
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
Non-numerical variable(s) ignored: desc_albuminemia2
Descriptive Statistics  
albuminemia_cor by desc_albuminemia2  
Data Frame: data_pred_nac  
N: 40839  

                          Mean   Std.Dev    Min   Median       Max   N.Valid   Pct.Valid
---------------------- ------- --------- ------ -------- --------- --------- -----------
            Albumin(?)   14.35    119.31   0.01     4.20   5328.85   4113.00       10.07
           Albuminemia   25.88     15.82   2.51    33.80     50.40     22.00      100.00
           Albuminuria   43.84    107.48   0.00    20.00   2015.00    423.00       10.61
      Microalbuminuria   75.80    366.55   0.30     4.25   2013.00    348.00       30.21
               No info      NA        NA     NA       NA        NA      0.00          NA

Por último, para ser consistentes con el manejo de albuminuria, como desconocemos que es Albulim, vamos a convertir a estos individuos en datos perdidos. Del mismo modo, los que son Albuminemia también serán eliminados porque representan otra variable. Las microalbuminurias también serán eliminadas.

data_pred_nac <- data_pred_nac |> 
  mutate(pre_urine_album2 = if_else(desc_albuminemia2 == "Albuminuria", albuminemia_cor, NA))
data_pred_nac |> 
  ggplot(aes(x = desc_albuminemia2, y = pre_urine_album2)) + 
  geom_boxplot()
Warning: Removed 269252 rows containing non-finite outside the scale range
(`stat_boxplot()`).

  • Por ultimo, debemos integrar los datos de las albuminurias de ambas variables:
data_pred_nac <- data_pred_nac |> 
  mutate(val_album = case_when(is.na(pre_urine_album1) & is.na(pre_urine_album2) ~ "Ambos NA", 
                               is.na(pre_urine_album1) & !is.na(pre_urine_album2) ~ "NA - Dato", 
                               !is.na(pre_urine_album1) & is.na(pre_urine_album2) ~ "Dato - NA", 
                               !is.na(pre_urine_album1) & !is.na(pre_urine_album2) ~ "Dato - Dato"))

data_pred_nac |> 
   count(val_album)

Se aprecia que siempre que no hay doble dato de albuminuria en cada columna, por tal motivo, procederemos a rescatar los datos:

data_pred_nac <- data_pred_nac |> 
  mutate(urine_album = case_when(val_album == "Ambos NA" ~ as.numeric(NA), 
                                 val_album == "Dato - NA" ~ pre_urine_album1, 
                                 val_album == "NA - Dato" ~ pre_urine_album2))
skimr::skim(data_pred_nac$urine_album)
Data summary
Name data_pred_nac$urine_album
Number of rows 269675
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 268518 0 35.62 157.34 0 3.67 6.6 20.84 3390 ▇▁▁▁▁
1.0.0.0.4 Razón albuminuria/creatinuria

Vamos tambien a crear una variable que solo contenga la razón albuminuria-creatinuria (RAC):

data_pred_nac <- data_pred_nac |> 
  mutate(acr = case_when(
    desc_tasa_ac == "Relación Albuminuria/Creatinuria" ~ tasa_ac_cor, 
    TRUE ~ as.numeric(NA)
    ))

data_pred_nac |> 
  count(acr)
skimr::skim(data_pred_nac$acr)
Data summary
Name data_pred_nac$acr
Number of rows 269675
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 252767 0.06 1201.85 14201.39 0 4.95 32.57 49.4 504864.9 ▇▁▁▁▁
  • Ahora crearmos una variable que calcule los acr_calc en base a la informacion disponible de albuminura y creatinuria que tenemos:
data_pred_nac <- data_pred_nac |> 
  mutate(acr_calc = urine_album / urine_crea)
data_pred_nac <- data_pred_nac |> 
  mutate(val_acr = case_when(is.na(acr_calc) & is.na(acr) ~ "Ambos NA", 
                               is.na(acr_calc) & !is.na(acr) ~ "NA - Dato", 
                               !is.na(acr_calc) & is.na(acr) ~ "Dato - NA", 
                               !is.na(acr_calc) & !is.na(acr) ~ "Dato - Dato"))

 data_pred_nac |> 
   count(val_acr)
skimr::skim(data_pred_nac$acr)
Data summary
Name data_pred_nac$acr
Number of rows 269675
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 252767 0.06 1201.85 14201.39 0 4.95 32.57 49.4 504864.9 ▇▁▁▁▁
nrow(data_pred_nac) - 252767
[1] 16908

Podemos apreciar que no hay dos columnas con datos, lo que hace que no se contradigan los valores. Vamos a proceder a actualizar la variable ACR para que contenga todos los valore de ACR que se puedan:

data_pred_nac <- data_pred_nac |> 
  mutate(acr = case_when(val_acr == "Ambos NA" ~ as.numeric(NA), 
                         val_acr == "Dato - NA" ~ acr_calc, 
                         val_acr== "NA - Dato" ~ acr))
skimr::skim(data_pred_nac$acr)
Data summary
Name data_pred_nac$acr
Number of rows 269675
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 251946 0.07 1147.83 13870.84 0 4.84 30 49.26 504864.9 ▇▁▁▁▁
nrow(data_pred_nac) - 251946
[1] 17729
  • Por ultimo, trataremos de recuperar los valores de de albuminuria y creatinuria que no estaban disponibles:
data_pred_nac <- data_pred_nac |> 
  mutate(val_album_crea_acr = case_when(is.na(urine_album) & is.na(urine_crea) & is.na(acr) ~ "NA - NA - NA", 
                                        is.na(urine_album) & !is.na(urine_crea) & !is.na(acr) ~ "NA - Dato - Dato", 
                                        !is.na(urine_album) & is.na(urine_crea) & !is.na(acr) ~ "Dato - NA - Dato",
                                        !is.na(urine_album) & !is.na(urine_crea) & is.na(acr) ~ "Dato - Dato - NA",
                                        is.na(urine_album) & is.na(urine_crea) & !is.na(acr) ~ "NA - NA - Dato",
                                        is.na(urine_album) & !is.na(urine_crea) & is.na(acr) ~ "NA - Dato - NA", 
                                        !is.na(urine_album) & is.na(urine_crea) & is.na(acr) ~ "Dato - NA - NA", 
                                        !is.na(urine_album) & !is.na(urine_crea) & !is.na(acr) ~ "Dato - Dato - Dato"
  ))

data_pred_nac |> 
  count(val_album_crea_acr)
data_pred_nac  <- data_pred_nac |> 
  mutate(urine_crea = case_when(val_album_crea_acr == "Dato - NA - Dato" ~ urine_album / acr, 
                                TRUE ~ urine_crea))
data_pred_nac <- data_pred_nac |> 
  mutate(val_album_crea_acr2 = case_when(is.na(urine_album) & is.na(urine_crea) & is.na(acr) ~ "NA - NA - NA", 
                                        is.na(urine_album) & !is.na(urine_crea) & !is.na(acr) ~ "NA - Dato - Dato", 
                                        !is.na(urine_album) & is.na(urine_crea) & !is.na(acr) ~ "Dato - NA - Dato",
                                        !is.na(urine_album) & !is.na(urine_crea) & is.na(acr) ~ "Dato - Dato - NA",
                                        is.na(urine_album) & is.na(urine_crea) & !is.na(acr) ~ "NA - NA - Dato",
                                        is.na(urine_album) & !is.na(urine_crea) & is.na(acr) ~ "NA - Dato - NA", 
                                        !is.na(urine_album) & is.na(urine_crea) & is.na(acr) ~ "Dato - NA - NA", 
                                        !is.na(urine_album) & !is.na(urine_crea) & !is.na(acr) ~ "Dato - Dato - Dato"
  ))

data_pred_nac |> 
  count(val_album_crea_acr2)

Notar que la mayoría de veces, solo tengo el dato de la razón albuminuria-creatinuria, pero no tengo los demás valores.

1.0.0.0.5 Fecha de ingreso

Exploraremos la data de fecha de ingreso:

data_pred_nac |> 
  count(f_ingreso) |> 
  head()

Luego procedemos a transforma los datos en una columna de tipo fecha:

data_pred_nac <- data_pred_nac |> 
  mutate(f_ingreso = dmy(f_ingreso))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `f_ingreso = dmy(f_ingreso)`.
Caused by warning:
!  15 failed to parse.

Verificamos que la conversion este bien hecha:

data_pred_nac |> 
  count(f_ingreso) |> 
  head()

Creamos una nueva variable que se llame:

data_pred_nac <- data_pred_nac |> 
  mutate(assess_date = f_ingreso)
1.0.0.0.6 Fecha de nacimiento

Primero exploremos la estructura de la variable fecha de nacimiento (fn):

data_pred_nac |> 
  count(fn)

Luego convertimos la variable de string a clase date:

data_pred_nac <- data_pred_nac |> 
  mutate(f_nac = dmy(fn))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `f_nac = dmy(fn)`.
Caused by warning:
!  413 failed to parse.

Se verifica la conversion a date:

data_pred_nac |> 
  count(f_nac)

Creamos una nueva variable que se llame nac_date:

data_pred_nac <- data_pred_nac |> 
  mutate(nac_date = f_nac)
1.0.0.0.7 Edad
  • Edad calculada restando fechas:
data_pred_nac <- data_pred_nac |> 
  mutate(age = floor(as.interval(f_nac %--% f_ingreso) / dyears(1)))
1.0.0.0.8 Sexo
data_pred_nac <- data_pred_nac |> 
  mutate(sex = case_match(sexo, 
                          0 ~ "Femenino", 
                          1 ~ "Masculino"), 
         sex = factor(sex, 
                      levels = c("Femenino", "Masculino")))

Tambien crearemos la variable male:

data_pred_nac <- data_pred_nac |> 
  mutate(male = if_else(sex == "Masculino", 1, 0))
1.0.0.0.9 Hipertension

Veamos como esta estructurado el indicador de hta:

data_pred_nac |> 
  tabyl(hta, hta_dm)

La variable hta_dm debe distribuirse dentro de hta:

data_pred_nac <- data_pred_nac |> 
  mutate(hta = case_when(hta == 1 | hta_dm == 1 ~ 1, 
                         hta == 0 & hta_dm == 0 ~ 0, 
                         TRUE ~ as.numeric(NA)))

Se verifica que la transformacion fue apropiada:

data_pred_nac |> 
  tabyl(hta, hta_dm)
1.0.0.0.10 Diabetes Mellitus

Veamos como esta estructurado el indicador de dm:

data_pred_nac |> 
  tabyl(dm, hta_dm)

La variable hta_dm debe distribuirse dentro de hta:

data_pred_nac <- data_pred_nac |> 
  mutate(dm = case_when(dm == 1 | hta_dm == 1 ~ 1, 
                         dm == 0 & hta_dm == 0 ~ 0, 
                         TRUE ~ as.numeric(NA)))

Se verifica que la transformacion fue apropiada:

data_pred_nac |> 
  tabyl(dm, hta_dm)

1.0.0.1 Tasa de filtración glomerural

Aunque tenemos una variable de tasa de filtracion glomerual estiamda, se aprecia que esta proviene de diversos métodos de cálculos, por lo que no podemos considerarla para el análisis.

data_pred_nac |> 
  count(desc_tfg)
data_pred_nac |> 
  count(tfg_cor)
Hmisc::describe(data_pred_nac$tfg_cor)
data_pred_nac$tfg_cor 
       n  missing distinct     Info     Mean      Gmd      .05      .10 
  219345    50330    31569        1    83.96    34.88    42.51    50.83 
     .25      .50      .75      .90      .95 
   63.56    78.77    96.18   116.20   132.45 

lowest : 0.21     0.23     0.4      0.44169  0.443309
highest: 2018     2914     6105     25704    68050   

Por tanto, vamos a volver a crear la tasa de filtracion glomerular estimada usando el metodo de CKD-EPI:

data_pred_nac <- data_pred_nac |> 
  mutate(eGFR_ckdepi = case_when(
    crea <= 0.7 & sex == "Femenino" ~ 144 * ((crea / 0.7) ^ (-0.329)) * (0.993 ^ (age)) * 1, 
    crea > 0.7 & sex == "Femenino" ~ 144 * ((crea / 0.7) ^ (-1.209)) * (0.993 ^ (age)) * 1, 
    crea <= 0.9 & sex == "Masculino" ~ 141 * ((crea / 0.9) ^ (-0.411)) * (0.993 ^ (age)) * 1, 
    crea > 0.9 & sex == "Masculino" ~ 141 * ((crea / 0.9) ^ (-1.209)) * (0.993 ^ (age)) * 1, 
    TRUE ~ as.numeric(NA)
    ))
Hmisc::describe(data_pred_nac$eGFR_ckdepi)
data_pred_nac$eGFR_ckdepi 
       n  missing distinct     Info     Mean      Gmd      .05      .10 
  225221    44454    16950        1    77.04    22.62    42.59    50.81 
     .25      .50      .75      .90      .95 
   63.64    78.65    91.86   100.80   106.25 

lowest : 1.69992 1.72647 1.83564 1.90941 1.98967
highest: 197.512 204.173 211.799 213.292 216.31 
1.0.0.1.1 Region de procedencia
data_pred_nac |> 
  count(region)
data_pred_nac <- data_pred_nac |> 
  mutate(cas = str_trim(region), 
         cas = str_to_title(region))
data_pred_nac |> 
  count(cas)
1.0.0.1.2 Fallecimiento

Vemos que la variable fallece tiene las columnas de fechas de fallecimiento totalmente desarmonizadas. Vamos a convertirlas a formato fecha:

data_pred_nac |> 
  count(fallece) |> 
  head()
data_pred_nac <- data_pred_nac |> 
  mutate(death_date = dmy(fallece))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `death_date = dmy(fallece)`.
Caused by warning:
!  225527 failed to parse.

Primero procederemos a crear la variable death:

data_pred_nac <- data_pred_nac |> 
  mutate(death = case_when(!is.na(death_date) ~ 1, 
                           is.na(death_date) ~ 0, 
                           TRUE ~ 0))

Luego procederemos a censurar las fechas que no son de muerte para el 02 de agosto de 2023.

data_pred_nac <- data_pred_nac |> 
  mutate(death_date = case_when(is.na(death_date ) ~ dmy("02-08-2023"), 
                                TRUE ~ death_date))
data_pred_nac |> 
  count(death)

Por ultimo, crearemos algunas variables extras relacionadas con la censura del presente estudio:

data_pred_nac <- data_pred_nac |> 
  mutate(    
    deathc = case_when(
      death == 0 | is.na(death) ~ 0, 
      death == 1 & death_date <= as.Date("2022-12-31") ~ 1, 
      death == 1 & death_date > as.Date("2022-12-31") ~ 0, 
      TRUE ~ as.numeric(NA)
    ),
    ddeathc = case_when(
      death_date <= as.Date("2022-12-31") ~ death_date, 
      death_date > as.Date("2022-12-31") | is.na(death_date) ~ as.Date("2022-12-31"),
      TRUE ~ as.Date(NA)
    ))
data_pred_nac |> 
  count(deathc)
1.0.0.1.3 Año
data_pred_nac |> 
  count(anio)
data_pred_nac <- data_pred_nac |> 
  mutate(year = year(assess_date))
1.0.0.1.4 Selección de columnas finales
datos <- data_pred_nac |> 
  select(auto_cor, sex, age, hta, dm, cas, nac_date, assess_date, crea, 
         eGFR_ckdepi, urine_album, urine_crea, acr, death_date, death,  deathc, 
         ddeathc, dial, dial_date, diag_cie)

1.0.0.2 Completado de celdas vacias

Para cada individuo con su fecha, vamos a completar las celdas con datos contiguos.

Vamos a contar el numero de datos perdidos que hay por cada fila, ordenarlas de mayor completitud a menor completitud de datos y eliminar todos los duplicados solo para quedarnos con las filas que más datos tengan.

datos$nmissing <- apply(datos, 1, function(x){sum(is.na(x))})
datos2 <- datos
VIM::aggr(datos)

datos |> 
  ggplot(aes(x = eGFR_ckdepi, y = acr)) + 
  scale_y_continuous(trans = "log10") + 
  geom_miss_point() + 
  facet_wrap(dial ~ .)

gg_miss_upset(datos)

gg_miss_upset(datos |>  select(eGFR_ckdepi, acr))

Luego vamos a proceder a completar las celdas perdidas para cada variable:

columnas_para_llenar <- c("sex", "age", "hta", "dm", "cas", "nac_date", 
                          "crea", "eGFR_ckdepi", "urine_album", "urine_crea", 
                          "acr")

datos2 <- datos2 |> 
  arrange(auto_cor)

for (col in columnas_para_llenar) {
  datos2 <- datos2 |> 
    arrange(auto_cor, !!sym(col)) |> 
    group_by(auto_cor, !!sym(col)) |> 
    fill(!!sym(col), .direction = "up") |> 
    ungroup()
}
datos2 |> 
  count(cas)

1.0.0.3 Identificacion y eliminacion de duplicados

1.0.0.3.1 Duplicados de fila completa

A continuacion vamos a hacer una primera limpieza identificado duplicados indenticos de linea y eliminandolos. Se identificaron 9506 filas con duplicados de toda la fila exactamente iguales:

datos2 |> 
  arrange(auto_cor, nmissing) |> 
  get_dupes()
No variable names specified - using all columns.

Procederemos a eliminarlos y quedarnos solo con uno:

datos2 <- datos2 |> 
  arrange(auto_cor, nmissing) |> 
  distinct(.keep_all = TRUE)

Por ultimo, verificamos que ya no existan estos duplicados de fila completa:

datos2 |> 
  arrange(auto_cor, nmissing) |> 
  get_dupes()
No variable names specified - using all columns.
No duplicate combinations found of: auto_cor, sex, age, hta, dm, cas, nac_date, assess_date, crea, ... and 12 other variables
1.0.0.3.2 Duplicados de ID + fecha de ingreso
datos2 |> 
  arrange(auto_cor, nmissing) |> 
  get_dupes(auto_cor, assess_date)
datos2 <- datos2 |> 
  arrange(auto_cor, assess_date, nmissing)
datos2 |> 
  arrange(auto_cor, assess_date, nmissing) |> 
  get_dupes(auto_cor, assess_date)

Procederemos a eliminarlos y quedarnos solo con la primera fila que ha sido ordenada de tal manera que tenga la menor cantidad de datos perdidos:

datos2 <- datos2 |> 
  arrange(auto_cor, assess_date, nmissing) |> 
  distinct(auto_cor, assess_date, .keep_all = TRUE)
datos2 |> 
  arrange(auto_cor, assess_date, nmissing) |> 
  get_dupes(auto_cor, assess_date)
No duplicate combinations found of: auto_cor, assess_date
1.0.0.3.3 Duplicados de ID unico

Por último, veremos los duplicados de ID unico:

datos2 |>
  arrange(auto_cor, assess_date, nmissing) |> 
  get_dupes(auto_cor)

Procedemos a ordenarlos para que el menor numero de datos perdidos por fila esté en la primera fila:

datos2 <- datos2 |> 
  arrange(auto_cor, assess_date)
datos2 |> 
  arrange(auto_cor, assess_date, nmissing) |> 
  get_dupes(auto_cor)

Procederemos a eliminar las primeras filas:

datos2 <- datos2 |> 
  arrange(auto_cor, assess_date, nmissing) |> 
  distinct(auto_cor, .keep_all = TRUE)
datos2 |> 
  arrange(auto_cor, assess_date, nmissing) |> 
  get_dupes(auto_cor)
No duplicate combinations found of: auto_cor
datos2 |> 
  count(cas)
1.0.0.3.4 Datos perdidos
sum_mis_data_pred_nac <- miss_summary(datos2)
sum_mis_data_pred_nac
sum_mis_data_pred_nac$miss_var_summary |> 
  knitr::kable()
variable n_miss pct_miss
urine_album 140930 99.5
diag_cie 140371 99.2
acr 130650 92.3
urine_crea 99047 70.0
eGFR_ckdepi 23928 16.9
age 23496 16.6
assess_date 23266 16.4
crea 473 0.334
nac_date 250 0.177
auto_cor 0 0
sex 0 0
hta 0 0
dm 0 0
cas 0 0
death_date 0 0
death 0 0
deathc 0 0
ddeathc 0 0
dial 0 0
dial_date 0 0
nmissing 0 0

1.0.0.4 Limpieza de datos

datos3 <- datos2 |> 
  mutate(
    male = if_else(sex == "Masculino", 1, 0), 
    risk2y = 1 - 0.9832 ^ exp(-0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137)), 
    risk5y = 1 - 0.9365 ^ exp(-0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137)), 
    pi2y = -0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137), 
    pi5y = -0.2201 * (age / 10 - 7.036) + 0.2467 * (male - 0.5642) - 0.5567 * (eGFR_ckdepi / 5 - 7.222) + 0.4510 * (log(acr) - 5.137), 
    dial_time = as.duration(assess_date %--% dial_date) / ddays(1), 
    death_time = as.duration(assess_date %--% death_date) / ddays(1), 
    dialc = case_when(
      dial == 0 | is.na(dial) ~ 0, 
      dial == 1 & dial_date <= as.Date("2022-12-31") ~ 1, 
      dial == 1 & dial_date > as.Date("2022-12-31") ~ 0,
      TRUE ~ as.numeric(NA)
    ), 
    ddialc = case_when(
      dial_date <= as.Date("2022-12-31") ~ dial_date, 
      dial_date > as.Date("2022-12-31") ~ as.Date("2022-12-31"),
      is.na(dial_date) & deathc == 1 ~ ddeathc, 
      is.na(dial_date) & deathc == 0 ~ as.Date("2022-12-31"), 
      TRUE ~ as.Date(NA)
    ), 
    tdeathc = as.duration(assess_date %--% ddeathc) / dyears(1), 
    tdialc = as.duration(assess_date %--% ddialc) / dyears(1), 
    status_num = case_when(
      dialc == 0 & deathc == 0 ~ 0, 
      dialc == 1 & deathc == 0 ~ 1, #< Evento de interes: dialisis
      dialc == 0 & deathc == 1 & death_time >= 0~ 2, #< Evento en competencia (muerte antes de dialisis)
      dialc == 1 & deathc == 1 & (tdialc <= tdeathc) ~ 1, 
      TRUE ~ as.numeric(NA)
    ), 
    status_num2 = factor(status_num, levels = c(0, 1, 2), 
                         labels = c("Alive w/o Kidney Failure", 
                                    "Kidney Failure", 
                                    "Death w/o Kidney Failure")), 
    time = case_when(
      status_num == 0 ~ tdialc, 
      status_num == 1 ~ tdialc, 
      status_num == 2 ~ tdeathc, 
      TRUE ~ as.numeric(NA)
    ), 
    status_num = as.integer(status_num), 
    grf_cat = case_when(
      eGFR_ckdepi > 90 ~ "G1", 
      eGFR_ckdepi >= 60 & eGFR_ckdepi <= 90 ~ "G2", 
      eGFR_ckdepi >= 45 & eGFR_ckdepi < 60 ~ "G3a", 
      eGFR_ckdepi >= 30 & eGFR_ckdepi < 45 ~ "G3b", 
      eGFR_ckdepi >= 15 & eGFR_ckdepi < 30 ~ "G4", 
      eGFR_ckdepi < 15 ~ "G5", 
      TRUE ~ as.character(NA)
    ), 
    acr2 = urine_album / urine_crea, 
    acr_cat = case_when(
      acr < 30 ~ "A1", 
      acr >= 30 & acr <= 300 ~ "A2", 
      acr > 300 ~ "A3",
      TRUE ~ as.character(NA)
    ), 
    ckd_class = case_when(
      grf_cat %in% c("G1", "G2") & acr_cat == "A1" ~ "Low risk", 
      (grf_cat %in% c("G3a") & acr_cat == "A1") | 
        (grf_cat %in% c("G1", "G2") & acr_cat == "A2") ~ "Moderately increased risk", 
      (grf_cat %in% c("G3b") & acr_cat == "A1") | 
        (grf_cat == "G3a" & acr_cat == "A2") | 
        (grf_cat %in% c("G1", "G2") & acr_cat == "A3") ~ "High risk", 
      (grf_cat %in% c("G4", "G5") & acr_cat == "A1") | 
        (grf_cat %in% c("G3b", "G4", "G5") & acr_cat == "A2") | 
        (grf_cat %in% c("G3a", "G3b", "G4", "G5") & acr_cat == "A3") ~ "Very high risk"
    ), 
    grf_cat = factor(grf_cat, levels = c("G1", "G2", "G3a", "G3b", "G4", "G5")), 
    acr_cat = factor(acr_cat, levels = c("A1", "A2", "A3")), 
    ckd_stage = case_when(
      grf_cat %in% c("G3a", "G3b", "G4") ~ "Stages 3-4", 
      grf_cat %in% c("G1", "G2", "G5") ~ "Stages 1-2 y 5"
    ), 
    ckd_stage = factor(ckd_stage, levels = c("Stages 1-2 y 5", "Stages 3-4")), 
    ckd_stage2 = case_when(
      grf_cat %in% c("G3b", "G4") ~ "Stages 3b-4", 
      grf_cat %in% c("G3a", "G5", "G1", "G2") ~ "Stages 1-3 y 5"
    ), 
    ckd_stage2 = factor(ckd_stage2, levels = c("Stages 1-3 y 5", "Stages 3b-4")), 
    ckd_class = factor(ckd_class, 
                       levels = c("Low risk", 
                                  "Moderately increased risk", 
                                  "High risk", 
                                  "Very high risk")), 
    ckd_class2 = case_when(
      ckd_class %in% c("Low risk", "Moderately increased risk", 
                       "High risk") ~ "Moderately/High risk", 
      ckd_class == "Very high risk" ~ "Very high risk", 
      TRUE ~ as.character(NA)
    ), 
    ckd_class2 = factor(ckd_class2, 
                        levels = c("Moderately/High risk", "Very high risk")), 
    across(where(is.factor), ~droplevels(.)), 
    total = 1, 
    # Censoring to 5 years----
    eventd = case_when(
      status_num2 == "Alive w/o Kidney Failure" ~ 0, 
      status_num2 == "Kidney Failure" ~ 1, 
      status_num2 == "Death w/o Kidney Failure" ~ 2, 
      TRUE ~ as.numeric(NA)
    ), 
    event = case_when(
      status_num2 %in% c("Alive w/o Kidney Failure", "Death w/o Kidney Failure") ~ 0, 
      status_num2 %in% c("Kidney Failure") ~ 1, 
      TRUE ~ as.numeric(NA)
    ),
    time_death5y = censor.time(time, deathc, time.cens = 5)$surv.time.cens, 
    death5y = censor.time(time, deathc, time.cens = 5)$surv.event.cens, 
    time_death2y = censor.time(time, deathc, time.cens = 2)$surv.time.cens, 
    death2y = censor.time(time, deathc, time.cens = 2)$surv.event.cens, 
    time5y = censor.time(time, event, time.cens = 5)$surv.time.cens, 
    event5y = censor.time(time, event, time.cens = 5)$surv.event.cens, 
    eventd5y = censor.time(time, eventd, time.cens = 5)$surv.event.cens, 
    eventd5ylab = case_when(
      eventd5y == 0 ~ "Alive w/o Kidney Failure", 
      eventd5y == 1 ~ "Kidney Failure", 
      eventd5y == 2 ~ "Death w/o Kidney Failure", 
      TRUE ~ as.character(NA)
    ), 
    time2y = censor.time(time, event, time.cens = 2)$surv.time.cens, 
    event2y = censor.time(time, event, time.cens = 2)$surv.event.cens, 
    eventd2y = censor.time(time, eventd, time.cens = 2)$surv.event.cens, 
    eventd2ylab = case_when(
      eventd2y == 0 ~ "Alive w/o Kidney Failure", 
      eventd2y == 1 ~ "Kidney Failure", 
      eventd2y == 2 ~ "Death w/o Kidney Failure", 
      TRUE ~ as.character(NA)
    )
  ) |> 
  set_variable_labels(
    cas = "Centro de atención de salud", 
    sex = "Sexo", 
    male = "Sexo, masculino", 
    age = "Edad (años)", 
    assess_date = "Fecha de evaluación", 
    crea = "Creatinina sérica (mg/dL)", 
    eGFR_ckdepi = "TFG usando CKD-EPI, ml/min/1.73m2", 
    acr = "Relación albúmina-creatinina, mg/g", 
    urine_album = "Albúmina en orina (mg/ml)", 
    urine_crea = "Creatinina en orina (mg/dl)", 
    death_date = "Fecha de defunción", 
    dial_date = "Fecha de hemodiálisis", 
    hta = "Hipertensión", 
    dm = "Diabetes Mellitus", 
    risk2y = "Riesgo pronosticado de fallo renal a 2 años", 
    risk5y = "Riesgo pronosticado de fallo renal a 5 años", 
    pi2y = "Índice pronóstico de fallo renal a 2 años", 
    pi5y = "Índice pronóstico de fallo renal a 5 años",     
    grf_cat = "Categorías de TFG", 
    acr_cat = "Categorías de albuminuria persistente", 
    ckd_class = "Clasificación CKD KDIGO", 
    ckd_class2 = "Clasificación CKD KDIGO",
    ckd_stage = "Etapas de CKD", 
    ckd_stage2 = "Etapas de CKD", 
    status_num = "Resultado", 
    status_num2 = "Resultado", 
    eventd5ylab = "Resultado a 5 años", 
    eventd2ylab = "Resultado a 2 años", 
    eventd5y = "Resultado a 5 años", 
    deathc = "Defunción",
    death5y = "Defunción a 5 años", 
    death2y = "Defunción a 2 años",
    time_death5y = "Tiempo hasta muerte a 5 años", 
    dialc = "Fallo renal", 
    total = "Total", 
    grf_cat = "Categorias de GFR", 
    ckd_class = "Clasificación de CKD KDIGO") 

1.0.0.5 Guardar dataset completo

export(datos3, here("Data", "Tidy", "data_derived_nacional.rds"))
datos3 |> 
  count(dial)
gg_miss_upset(datos3 |> 
                select(eGFR_ckdepi, acr))

1.0.1 Fusionar Datos Reba + Datos Nacional

datos_total <- datos3 |> 
  filter(cas != "Rebagliati") |> 
  bind_rows(data_pred_reba4) |> 
  # bind_rows(data_pred_kaelin4) |> 
  select(-dni, -fuente,   -auto_cor) |> #-tipo_doc -fallecido, -fech_fallecimiento, 
  mutate(id = 1:n()) |> 
  select(id, everything()) |> 
  mutate(cas = case_when(cas == "Almenara" ~ "Lima - Almenara", 
                         cas == "Libertad" ~ "La Libertad", 
                         cas == "REBAGLIATI" ~ "Lima - Rebagliati", 
                         cas == "Red Ica" ~ "Ica", 
                         cas == "Sabogal" ~ "Lima - Sabogal",
                         TRUE ~ cas))
datos_total |> head()
datos_total |> 
  count(cas)

Recuperamos las etiquetas:

datos_total <- datos_total |>  
  set_variable_labels(
    cas = "Centro de atención de salud", 
    sex = "Sexo", 
    male = "Sexo, masculino", 
    age = "Edad (años)", 
    assess_date = "Fecha de evaluación", 
    crea = "Creatinina sérica (mg/dL)", 
    eGFR_ckdepi = "TFG usando CKD-EPI, ml/min/1.73m2", 
    acr = "Relación albúmina-creatinina, mg/g", 
    urine_album = "Albúmina en orina (mg/ml)", 
    urine_crea = "Creatinina en orina (mg/dl)", 
    death_date = "Fecha de defunción", 
    dial_date = "Fecha de hemodiálisis", 
    hta = "Hipertensión", 
    dm = "Diabetes Mellitus", 
    risk2y = "Riesgo pronosticado de fallo renal a 2 años", 
    risk5y = "Riesgo pronosticado de fallo renal a 5 años", 
    pi2y = "Índice pronóstico de fallo renal a 2 años", 
    pi5y = "Índice pronóstico de fallo renal a 5 años",     
    grf_cat = "Categorías de TFG", 
    acr_cat = "Categorías de albuminuria persistente", 
    ckd_class = "Clasificación CKD KDIGO", 
    ckd_class2 = "Clasificación CKD KDIGO",
    ckd_stage = "Etapas de CKD", 
    ckd_stage2 = "Etapas de CKD", 
    status_num = "Resultado", 
    status_num2 = "Resultado", 
    eventd5ylab = "Resultado a 5 años", 
    eventd2ylab = "Resultado a 2 años", 
    eventd5y = "Resultado a 5 años", 
    deathc = "Defunción",
    death5y = "Defunción a 5 años", 
    death2y = "Defunción a 2 años",
    time_death5y = "Tiempo hasta muerte a 5 años", 
    dialc = "Fallo renal", 
    total = "Total", 
    grf_cat = "Categorias de GFR", 
    ckd_class = "Clasificación de CKD KDIGO") 
Importante
  • No se incluyó datos del Kaelin porque el registro de muerte se basa en SINADEF y no en la data de oficina de asegurados

1.1 Guardar dataset final integrado

export(datos_total, here("Data", "Tidy", "datos_total_integrados.rds"))
export(datos_total, here("Data", "Tidy", "datos_total_integrados.xlsx"))
export(datos_total, here("Data", "Tidy", "datos_total_integrados.dta"))

1.2 Ticket de Reproducibilidad

sessionInfo()
R version 4.3.3 (2024-02-29 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 11 x64 (build 22631)

Matrix products: default


locale:
[1] LC_COLLATE=Spanish_Peru.utf8  LC_CTYPE=Spanish_Peru.utf8   
[3] LC_MONETARY=Spanish_Peru.utf8 LC_NUMERIC=C                 
[5] LC_TIME=Spanish_Peru.utf8    

time zone: America/Lima
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] labelled_2.12.0     summarytools_1.0.1  naniar_1.1.0       
 [4] janitor_2.2.0       rio_1.0.1           lubridate_1.9.3    
 [7] forcats_1.0.0       stringr_1.5.1       dplyr_1.1.4        
[10] purrr_1.0.2         readr_2.1.5         tidyr_1.3.1        
[13] tibble_3.2.1        ggplot2_3.5.0       tidyverse_2.0.0    
[16] here_1.0.1          pacman_0.5.1        survcomp_1.52.0    
[19] prodlim_2023.08.28  survival_3.5-8      BiocManager_1.30.22

loaded via a namespace (and not attached):
  [1] gridExtra_2.3       writexl_1.5.0       tcltk_4.3.3        
  [4] readxl_1.4.3        rlang_1.1.3         magrittr_2.0.3     
  [7] snakecase_0.11.1    e1071_1.7-14        matrixStats_1.2.0  
 [10] compiler_4.3.3      vctrs_0.6.5         reshape2_1.4.4     
 [13] pkgconfig_2.0.3     fastmap_1.1.1       backports_1.4.1    
 [16] magick_2.8.3        labeling_0.4.3      pander_0.6.5       
 [19] utf8_1.2.4          rmarkdown_2.26      tzdb_0.4.0         
 [22] haven_2.5.4         visdat_0.6.0        UpSetR_1.4.0       
 [25] xfun_0.43           jsonlite_1.8.8      SuppDists_1.1-9.7  
 [28] pryr_0.1.6          cluster_2.1.6       parallel_4.3.3     
 [31] R6_2.5.1            vcd_1.4-12          stringi_1.8.3      
 [34] ranger_0.16.0       boot_1.3-30         car_3.1-2          
 [37] rpart_4.1.23        parallelly_1.37.1   lmtest_0.9-40      
 [40] cellranger_1.1.0    Rcpp_1.0.12         knitr_1.45         
 [43] future.apply_1.11.2 zoo_1.8-12          base64enc_0.1-3    
 [46] R.utils_2.12.3      nnet_7.3-19         survivalROC_1.0.3.1
 [49] Matrix_1.6-5        splines_4.3.3       timechange_0.3.0   
 [52] tidyselect_1.2.1    abind_1.4-5         rstudioapi_0.16.0  
 [55] yaml_2.3.8          codetools_0.2-19    listenv_0.9.1      
 [58] lattice_0.22-5      plyr_1.8.9          withr_3.0.0        
 [61] evaluate_0.23       foreign_0.8-86      future_1.33.2      
 [64] proxy_0.4-27        pillar_1.9.0        carData_3.0-5      
 [67] KernSmooth_2.23-22  VIM_6.2.2           checkmate_2.3.1    
 [70] generics_0.1.3      sp_2.1-3            rprojroot_2.0.4    
 [73] hms_1.1.3           munsell_0.5.0       scales_1.3.0       
 [76] laeken_0.5.3        globals_0.16.3      class_7.3-22       
 [79] glue_1.7.0          bootstrap_2019.6    Hmisc_5.1-2        
 [82] tools_4.3.3         robustbase_0.99-2   data.table_1.15.4  
 [85] rapportools_1.1     grid_4.3.3          colorspace_2.1-0   
 [88] repr_1.1.7          htmlTable_2.4.2     Formula_1.2-5      
 [91] cli_3.6.2           fansi_1.0.6         lava_1.8.0         
 [94] DEoptimR_1.1-3      gtable_0.3.4        R.methodsS3_1.8.2  
 [97] digest_0.6.35       htmlwidgets_1.6.4   skimr_2.1.5        
[100] farver_2.1.1        htmltools_0.5.8     R.oo_1.26.0        
[103] lifecycle_1.0.4     rmeta_3.0           MASS_7.3-60.0.1